import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrixLab 8
Lab 8: Linear Classifiers
# Read the strain table and discard every row that contains a missing value.
# NOTE: the path is absolute and specific to one machine.
data = pd.read_csv(
    "/Users/ben/Documents/GitHub/DSML/Data/cannabis_full.csv"
).dropna()
data.head()| Strain | Type | Rating | Effects | Flavor | Creative | Energetic | Tingly | Euphoric | Relaxed | ... | Ammonia | Minty | Tree | Fruit | Butter | Pineapple | Tar | Rose | Plum | Pear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100-Og | hybrid | 4.0 | Creative,Energetic,Tingly,Euphoric,Relaxed | Earthy,Sweet,Citrus | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 98-White-Widow | hybrid | 4.7 | Relaxed,Aroused,Creative,Happy,Energetic | Flowery,Violet,Diesel | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 1024 | sativa | 4.4 | Uplifted,Happy,Relaxed,Energetic,Creative | Spicy/Herbal,Sage,Woody | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 13-Dawgs | hybrid | 4.2 | Tingly,Creative,Hungry,Relaxed,Uplifted | Apricot,Citrus,Grapefruit | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 24K-Gold | hybrid | 4.6 | Happy,Relaxed,Euphoric,Uplifted,Talkative | Citrus,Earthy,Orange | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 69 columns
Part One: Binary Classification
# Part One is a two-class problem: keep only the sativa and indica rows.
is_data = data[data["Type"].isin(["sativa", "indica"])]
is_data.head()| Strain | Type | Rating | Effects | Flavor | Creative | Energetic | Tingly | Euphoric | Relaxed | ... | Ammonia | Minty | Tree | Fruit | Butter | Pineapple | Tar | Rose | Plum | Pear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 1024 | sativa | 4.4 | Uplifted,Happy,Relaxed,Energetic,Creative | Spicy/Herbal,Sage,Woody | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 5 | 3-Bears-Og | indica | 0.0 | None | None | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 7 | 303-Og | indica | 4.2 | Relaxed,Happy,Euphoric,Uplifted,Giggly | Citrus,Pungent,Earthy | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 8 | 3D-Cbd | sativa | 4.6 | Uplifted,Focused,Happy,Talkative,Relaxed | Earthy,Woody,Flowery | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 9 | 3X-Crazy | indica | 4.4 | Relaxed,Tingly,Happy,Euphoric,Uplifted | Earthy,Grape,Sweet | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 69 columns
# Predictors: every column except the strain name, the label, and the two
# comma-separated text columns.
X = is_data.drop(columns=["Strain", "Type", "Effects", "Flavor"])
# Response: the strain type, still as strings at this point.
y = is_data["Type"]
y = LabelEncoder().fit_transform(y)Q1: LDA
As this is a binary classification problem, I believe a good metric to use would be F1-score. This metric considers both precision and recall, making it a good choice when classes are imbalanced.
# Preprocessing step reused by every pipeline below: standardize each
# numeric column picked up by the dtype selector.
ct = ColumnTransformer(
    transformers=[
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ]
)
# LDA with the eigen solver; the shrinkage amount is tuned below.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("LDA", LinearDiscriminantAnalysis(solver="eigen")),
    ]
)

# Candidate shrinkage values, compared by 5-fold cross-validated F1.
alphas = {"LDA__shrinkage": [0.0001, 0.001, 0.01, 0.1, 1]}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per candidate in the grid.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x15e82b290>)])),
('LDA',
LinearDiscriminantAnalysis(shrinkage=1, solver='eigen'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x15e82b290>)])),
('LDA',
LinearDiscriminantAnalysis(shrinkage=1, solver='eigen'))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x15e82b290>)])<sklearn.compose._column_transformer.make_column_selector object at 0x15e82b290>
StandardScaler()
LinearDiscriminantAnalysis(shrinkage=1, solver='eigen')
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is better than random guesses.")Average F1 Score across 25 cross validations: 0.7895754166409029
This is a score that is better than random guesses.
# Rebuild the LDA pipeline with the shrinkage value the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("LDA", LinearDiscriminantAnalysis(solver="eigen", shrinkage=1)),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# Row/column labels assume the encoded order 0 = indica, 1 = sativa.
cm_df = pd.DataFrame(
    cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)
cm_df| Predicted Indica | Predicted Sativa | |
|---|---|---|
| Actual Indica | 605 | 82 |
| Actual Sativa | 73 | 358 |
Q2: QDA
As this is a binary classification problem, I believe a good metric to use would be F1-score. This metric considers both precision and recall, making it a good choice when classes are imbalanced.
# QDA on the standardized predictors; reg_param is tuned below.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("QDA", QuadraticDiscriminantAnalysis()),
    ]
)

# Candidate regularization values, compared by 5-fold cross-validated F1.
alphas = {"QDA__reg_param": [0, 0.0001, 0.001, 0.01, 0.1, 1]}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per candidate in the grid.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x15d9eec10>)])),
('QDA', QuadraticDiscriminantAnalysis(reg_param=1))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x15d9eec10>)])),
('QDA', QuadraticDiscriminantAnalysis(reg_param=1))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x15d9eec10>)])<sklearn.compose._column_transformer.make_column_selector object at 0x15d9eec10>
StandardScaler()
QuadraticDiscriminantAnalysis(reg_param=1)
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")Average F1 Score across 30 cross validations: 0.67734658323323
This is a score that is slightly better than random guesses.
# Rebuild the QDA pipeline with the reg_param the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("QDA", QuadraticDiscriminantAnalysis(reg_param=1)),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# Row/column labels assume the encoded order 0 = indica, 1 = sativa.
cm_df = pd.DataFrame(
    cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)
cm_df| Predicted Indica | Predicted Sativa | |
|---|---|---|
| Actual Indica | 607 | 80 |
| Actual Sativa | 73 | 358 |
Q3: SVC
As this is a binary classification problem, I believe a good metric to use would be F1-score. This metric considers both precision and recall, making it a good choice when classes are imbalanced.
# Support vector classifier with its default kernel; C is tuned below.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVC", SVC()),
    ]
)

# Candidate C values, compared by 5-fold cross-validated F1.
alphas = {"SVC__C": [0.1, 0.5, 1, 10, 100]}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per candidate in the grid.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x15e8c1bd0>)])),
('SVC', SVC(C=10))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x15e8c1bd0>)])),
('SVC', SVC(C=10))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x15e8c1bd0>)])<sklearn.compose._column_transformer.make_column_selector object at 0x15e8c1bd0>
StandardScaler()
SVC(C=10)
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is better than random guesses.")Average F1 Score across 25 cross validations: 0.7222126650132197
This is a score that is better than random guesses.
# Rebuild the SVC pipeline with the C value the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVC", SVC(C=10)),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# Row/column labels assume the encoded order 0 = indica, 1 = sativa.
cm_df = pd.DataFrame(
    cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)
cm_df| Predicted Indica | Predicted Sativa | |
|---|---|---|
| Actual Indica | 676 | 11 |
| Actual Sativa | 26 | 405 |
Q4: SVM
As this is a binary classification problem, I believe a good metric to use would be F1-score. This metric considers both precision and recall, making it a good choice when classes are imbalanced.
# Polynomial-kernel SVM; both C and the polynomial degree are tuned below.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVM", SVC(kernel="poly")),
    ]
)

# 5 C values x 10 degrees, compared by 5-fold cross-validated F1.
alphas = {
    "SVM__C": [0.1, 1, 10, 100, 1000],
    "SVM__degree": list(range(1, 11)),
}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per (C, degree) combination.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1a32842d0>)])),
('SVM', SVC(C=100, degree=1, kernel='poly'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1a32842d0>)])),
('SVM', SVC(C=100, degree=1, kernel='poly'))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1a32842d0>)])<sklearn.compose._column_transformer.make_column_selector object at 0x1a32842d0>
StandardScaler()
SVC(C=100, degree=1, kernel='poly')
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is about the slightly worse than random guesses.")Average F1 Score across 250 cross validations: 0.42109205230492136
This is a score that is about the slightly worse than random guesses.
# Rebuild the polynomial SVM with the C and degree the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVM", SVC(kernel="poly", C=100, degree=1)),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# Row/column labels assume the encoded order 0 = indica, 1 = sativa.
cm_df = pd.DataFrame(
    cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)
cm_df| Predicted Indica | Predicted Sativa | |
|---|---|---|
| Actual Indica | 633 | 54 |
| Actual Sativa | 77 | 354 |
Part Two: Natural Multiclass
# Part Two uses every strain type, so start again from the full table.
# Predictors: every column except the strain name, the label, and the two
# comma-separated text columns.
X = data.drop(columns=["Strain", "Type", "Effects", "Flavor"])
# Response: the strain type, still as strings at this point.
y = data["Type"]
y = LabelEncoder().fit_transform(y)Q1: Decision Tree
# Decision tree on the standardized predictors; the cost-complexity
# pruning strength (ccp_alpha) is tuned below.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("DTree", DecisionTreeClassifier()),
    ]
)

# Candidate pruning strengths, compared by 5-fold macro-averaged F1.
alphas = {"DTree__ccp_alpha": [0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1_macro",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per candidate in the grid.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x168287090>)])),
('DTree', DecisionTreeClassifier(ccp_alpha=0.001))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x168287090>)])),
('DTree', DecisionTreeClassifier(ccp_alpha=0.001))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x168287090>)])<sklearn.compose._column_transformer.make_column_selector object at 0x168287090>
StandardScaler()
DecisionTreeClassifier(ccp_alpha=0.001)
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly worse than random guesses.")Average F1 Score across 40 cross validations: 0.3854219815972776
This is a score that is slightly worse than random guesses.
# Rebuild the tree pipeline with the ccp_alpha the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("DTree", DecisionTreeClassifier(ccp_alpha=0.001)),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# Row/column labels assume the encoded order 0 = hybrid, 1 = indica,
# 2 = sativa.
cm_df = pd.DataFrame(
    cm,
    index=["Actual Hybrid", "Actual Indica", "Actual Sativa"],
    columns=["Predicted Hybrid", "Predicted Indica", "Predicted Sativa"],
)
cm_df| Predicted Hybrid | Predicted Indica | Predicted Sativa | |
|---|---|---|---|
| Actual Hybrid | 887 | 178 | 122 |
| Actual Indica | 242 | 433 | 12 |
| Actual Sativa | 212 | 18 | 201 |
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Draw the pruned tree on a very large canvas (200 x 100 inches).
plt.figure(figsize=(200, 100))

# The tree drawn here is fit directly on X (not through the pipeline), so
# the split thresholds are on the original, unscaled feature values.
plot_tree(
    DecisionTreeClassifier(ccp_alpha=0.001).fit(X, y),
    filled=True,
    feature_names=list(X.columns),
    class_names=["Hybrid", "Indica", "Sativa"],
)
plt.show()Examining the decision tree provides a very interesting way of understanding how the model is attempting to classify the strains. To begin, it discovered that the most common difference between sativa and indica is that one is more likely to make a person sleepy than the other. This initial split results in only 28 sativa strains being left on the right side of the tree, with the rest being pushed to the left side of the tree. From here, the logic of the tree changes for the two sides. On the right side of the tree, the model attempts to use flavors to classify, while on the left side of the tree, the model focuses on how the strain makes people feel. This methodology seems to allow the model to weed out the sativa strains and focus on the classification between indica and hybrid.
Q2: LDA, QDA, and KNN
# LDA
# Three-class LDA with the eigen solver; the shrinkage amount is tuned below.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("LDA", LinearDiscriminantAnalysis(solver="eigen")),
    ]
)

# Candidate shrinkage values, compared by 5-fold macro-averaged F1.
alphas = {"LDA__shrinkage": [0.0001, 0.001, 0.01, 0.1, 1]}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1_macro",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per candidate in the grid.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x16825f310>)])),
('LDA',
LinearDiscriminantAnalysis(shrinkage=0.1, solver='eigen'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x16825f310>)])),
('LDA',
LinearDiscriminantAnalysis(shrinkage=0.1, solver='eigen'))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x16825f310>)])<sklearn.compose._column_transformer.make_column_selector object at 0x16825f310>
StandardScaler()
LinearDiscriminantAnalysis(shrinkage=0.1, solver='eigen')
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")Average F1 Score across 25 cross validations: 0.5942931559991004
This is a score that is slightly better than random guesses.
# Rebuild the LDA pipeline with the shrinkage value the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("LDA", LinearDiscriminantAnalysis(solver="eigen", shrinkage=0.1)),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# Row/column labels assume the encoded order 0 = hybrid, 1 = indica,
# 2 = sativa.
cm_df = pd.DataFrame(
    cm,
    index=["Actual Hybrid", "Actual Indica", "Actual Sativa"],
    columns=["Predicted Hybrid", "Predicted Indica", "Predicted Sativa"],
)
cm_df| Predicted Hybrid | Predicted Indica | Predicted Sativa | |
|---|---|---|---|
| Actual Hybrid | 835 | 206 | 146 |
| Actual Indica | 207 | 468 | 12 |
| Actual Sativa | 222 | 20 | 189 |
# QDA
# Three-class QDA on the standardized predictors; reg_param is tuned below.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("QDA", QuadraticDiscriminantAnalysis()),
    ]
)

# Candidate regularization values, compared by 5-fold macro-averaged F1.
alphas = {"QDA__reg_param": [0, 0.0001, 0.001, 0.01, 0.1, 1]}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1_macro",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per candidate in the grid.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1a2e53d50>)])),
('QDA', QuadraticDiscriminantAnalysis(reg_param=1))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1a2e53d50>)])),
('QDA', QuadraticDiscriminantAnalysis(reg_param=1))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1a2e53d50>)])<sklearn.compose._column_transformer.make_column_selector object at 0x1a2e53d50>
StandardScaler()
QuadraticDiscriminantAnalysis(reg_param=1)
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is worse than random guesses.")Average F1 Score across 30 cross validations: 0.39560132926837605
This is a score that is worse than random guesses.
# Rebuild the QDA pipeline with the reg_param the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("QDA", QuadraticDiscriminantAnalysis(reg_param=1)),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# Row/column labels assume the encoded order 0 = hybrid, 1 = indica,
# 2 = sativa.
cm_df = pd.DataFrame(
    cm,
    index=["Actual Hybrid", "Actual Indica", "Actual Sativa"],
    columns=["Predicted Hybrid", "Predicted Indica", "Predicted Sativa"],
)
cm_df| Predicted Hybrid | Predicted Indica | Predicted Sativa | |
|---|---|---|---|
| Actual Hybrid | 796 | 213 | 178 |
| Actual Indica | 198 | 469 | 20 |
| Actual Sativa | 197 | 18 | 216 |
# KNN
# k-nearest neighbours on the standardized predictors; k is tuned below.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("KNN", KNeighborsClassifier()),
    ]
)

# k = 1 .. 49, compared by 5-fold macro-averaged F1.
alphas = {"KNN__n_neighbors": list(range(1, 50))}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1_macro",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per candidate k.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1671dbbd0>)])),
('KNN', KNeighborsClassifier(n_neighbors=3))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1671dbbd0>)])),
('KNN', KNeighborsClassifier(n_neighbors=3))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1671dbbd0>)])<sklearn.compose._column_transformer.make_column_selector object at 0x1671dbbd0>
StandardScaler()
KNeighborsClassifier(n_neighbors=3)
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly worse than random guesses.")Average F1 Score across 245 cross validations: 0.4400240703580052
This is a score that is slightly worse than random guesses.
# Rebuild the KNN pipeline with the neighbour count the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("KNN", KNeighborsClassifier(n_neighbors=3)),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# Row/column labels assume the encoded order 0 = hybrid, 1 = indica,
# 2 = sativa.
cm_df = pd.DataFrame(
    cm,
    index=["Actual Hybrid", "Actual Indica", "Actual Sativa"],
    columns=["Predicted Hybrid", "Predicted Indica", "Predicted Sativa"],
)
cm_df| Predicted Hybrid | Predicted Indica | Predicted Sativa | |
|---|---|---|---|
| Actual Hybrid | 1003 | 117 | 67 |
| Actual Indica | 215 | 464 | 8 |
| Actual Sativa | 180 | 18 | 233 |
Q3
My metrics were significantly worse than in Part One. This is because adding a third, difficult-to-distinguish category lowered the models' accuracy and left them confused more often about which type each strain belonged to. According to the confusion matrices, the category most likely to get mixed up was hybrid. The models often identified the hybrid strains correctly, but they also over-predicted that class, incorrectly categorizing many indica and sativa strains as hybrid. This is because hybrid strains share many characteristics with both sativa and indica.
Part Three: Multiclass from Binary
Q1
# Indica vs. Not Indica
y = np.where(data["Type"] == "indica", 1, 0)# SVC
# Indica-vs-rest support vector classifier; C is tuned below.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVC", SVC()),
    ]
)

# Candidate C values, compared by 5-fold cross-validated F1.
alphas = {"SVC__C": [0.1, 0.5, 1, 10, 100]}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per candidate in the grid.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x16819acd0>)])),
('SVC', SVC(C=1))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x16819acd0>)])),
('SVC', SVC(C=1))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x16819acd0>)])<sklearn.compose._column_transformer.make_column_selector object at 0x16819acd0>
StandardScaler()
SVC(C=1)
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")Average F1 Score across 25 cross validations: 0.53992997672384
This is a score that is slightly better than random guesses.
# Rebuild the SVC pipeline with the C value the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVC", SVC(C=1)),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# y is 1 for indica and 0 for everything else, so row/column 0 is "Other".
cm_df = pd.DataFrame(
    cm,
    index=["Actual Other", "Actual Indica"],
    columns=["Predicted Other", "Predicted Indica"],
)
cm_df| Predicted Other | Predicted Indica | |
|---|---|---|
| Actual Other | 1461 | 157 |
| Actual Indica | 216 | 471 |
# Logistic Regression
# Indica-vs-rest elastic-net logistic regression fit with the saga solver.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("LogReg", LogisticRegression(solver="saga", max_iter=1000)),
    ]
)

# 5 l1_ratio values x 9 C values, compared by 5-fold cross-validated F1.
alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [0.0001, 0.001, 0.01, 0.1, 1],
    "LogReg__C": list(range(1, 10)),
}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per (l1_ratio, C) combination.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x166edad90>)])),
('LogReg',
LogisticRegression(C=1, l1_ratio=1, max_iter=1000,
penalty='elasticnet', solver='saga'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x166edad90>)])),
('LogReg',
LogisticRegression(C=1, l1_ratio=1, max_iter=1000,
penalty='elasticnet', solver='saga'))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x166edad90>)])<sklearn.compose._column_transformer.make_column_selector object at 0x166edad90>
StandardScaler()
LogisticRegression(C=1, l1_ratio=1, max_iter=1000, penalty='elasticnet',
solver='saga')print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is better than random guesses.")Average F1 Score across 225 cross validations: 0.6284981314081471
This is a score that is better than random guesses.
# Rebuild the logistic regression with the C and l1_ratio the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        (
            "LogReg",
            LogisticRegression(
                solver="saga",
                max_iter=1000,
                penalty="elasticnet",
                C=1,
                l1_ratio=1,
            ),
        ),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# y is 1 for indica and 0 for everything else, so row/column 0 is "Other".
cm_df = pd.DataFrame(
    cm,
    index=["Actual Other", "Actual Indica"],
    columns=["Predicted Other", "Predicted Indica"],
)
cm_df| Predicted Other | Predicted Indica | |
|---|---|---|
| Actual Other | 1435 | 183 |
| Actual Indica | 257 | 430 |
# Sativa vs. Not Sativa
y = np.where(data["Type"] == "sativa", 1, 0)# SVC
# Sativa-vs-rest support vector classifier; C is tuned below.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVC", SVC()),
    ]
)

# Candidate C values, compared by 5-fold cross-validated F1.
alphas = {"SVC__C": [0.1, 0.5, 1, 10, 100]}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per candidate in the grid.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x165f26d90>)])),
('SVC', SVC(C=10))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x165f26d90>)])),
('SVC', SVC(C=10))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x165f26d90>)])<sklearn.compose._column_transformer.make_column_selector object at 0x165f26d90>
StandardScaler()
SVC(C=10)
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is significantly worse than random guesses.")Average F1 Score across 25 cross validations: 0.1920921810596639
This is a score that is significantly worse than random guesses.
# Rebuild the SVC pipeline with the C value the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVC", SVC(C=10)),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# y is 1 for sativa and 0 for everything else, so row/column 0 is "Other".
cm_df = pd.DataFrame(
    cm,
    index=["Actual Other", "Actual Sativa"],
    columns=["Predicted Other", "Predicted Sativa"],
)
cm_df| Predicted Other | Predicted Sativa | |
|---|---|---|
| Actual Other | 1843 | 31 |
| Actual Sativa | 96 | 335 |
# Logistic Regression
# Sativa-vs-rest elastic-net logistic regression fit with the saga solver.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("LogReg", LogisticRegression(solver="saga", max_iter=2500)),
    ]
)

# 5 l1_ratio values x 9 C values, compared by 5-fold cross-validated F1.
alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [0.0001, 0.001, 0.01, 0.1, 1],
    "LogReg__C": list(range(1, 10)),
}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per (l1_ratio, C) combination.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167feba90>)])),
('LogReg',
LogisticRegression(C=1, l1_ratio=0.1, max_iter=2500,
penalty='elasticnet', solver='saga'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167feba90>)])),
('LogReg',
LogisticRegression(C=1, l1_ratio=0.1, max_iter=2500,
penalty='elasticnet', solver='saga'))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167feba90>)])<sklearn.compose._column_transformer.make_column_selector object at 0x167feba90>
StandardScaler()
LogisticRegression(C=1, l1_ratio=0.1, max_iter=2500, penalty='elasticnet',
solver='saga')print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly worse than random guesses.")Average F1 Score across 225 cross validations: 0.39862439717664777
This is a score that is slightly worse than random guesses.
# Rebuild the logistic regression with the C and l1_ratio the search selected.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        (
            "LogReg",
            LogisticRegression(
                solver="saga",
                max_iter=2500,
                penalty="elasticnet",
                C=1,
                l1_ratio=0.1,
            ),
        ),
    ]
)
fitted_pipeline = my_pipeline.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on, so the
# confusion matrix below is in-sample.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# y is 1 for sativa and 0 for everything else, so row/column 0 is "Other".
cm_df = pd.DataFrame(
    cm,
    index=["Actual Other", "Actual Sativa"],
    columns=["Predicted Other", "Predicted Sativa"],
)
cm_df| Predicted Other | Predicted Sativa | |
|---|---|---|
| Actual Other | 1781 | 93 |
| Actual Sativa | 286 | 145 |
# Hybrid vs. Not Hybrid
y = np.where(data["Type"] == "hybrid", 1, 0)# SVC
# Hybrid-vs-rest support vector classifier; C is tuned below.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVC", SVC()),
    ]
)

# Candidate C values, compared by 5-fold cross-validated F1.
alphas = {"SVC__C": [0.1, 0.5, 1, 10, 100]}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    cv=5,
    scoring="f1",
)
gscv_fitted = gscv.fit(X, y)

# One mean cross-validation score per candidate in the grid.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167350150>)])),
('SVC', SVC(C=0.1))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167350150>)])),
('SVC', SVC(C=0.1))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167350150>)])<sklearn.compose._column_transformer.make_column_selector object at 0x167350150>
StandardScaler()
SVC(C=0.1)
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")Average F1 Score across 25 cross validations: 0.6269445895255663
This is a score that is slightly better than random guesses.
# Refit the SVC with the C reported by the grid search (C=0.1) and tabulate
# its confusion matrix for hybrid vs. other.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("SVC", SVC(C=0.1)),
])
fitted_pipeline = my_pipeline.fit(X, y)
# NOTE: predictions are made on the same X used for fitting, so this is an
# in-sample confusion matrix, not a cross-validated one.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual Other", "Actual Hybrid"],
    columns=["Predicted Other", "Predicted Hybrid"],
)
cm_df| Predicted Other | Predicted Hybrid | |
|---|---|---|
| Actual Other | 441 | 677 |
| Actual Hybrid | 170 | 1017 |
# Logistic Regression
# Grid-search an elastic-net logistic regression over the L1 mixing ratio and
# C = 1..9, using 5-fold cross-validation scored by F1.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(max_iter=2500, solver="saga")),
])
alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [0.0001, 0.001, 0.01, 0.1, 1],
    "LogReg__C": [*range(1, 10)],
}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    scoring="f1",
    cv=5,
)
gscv_fitted = gscv.fit(X, y)
# One mean test-fold F1 per (l1_ratio, C) candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x168a6d750>)])),
('LogReg',
LogisticRegression(C=1, l1_ratio=1, max_iter=2500,
penalty='elasticnet', solver='saga'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x168a6d750>)])),
('LogReg',
LogisticRegression(C=1, l1_ratio=1, max_iter=2500,
penalty='elasticnet', solver='saga'))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x168a6d750>)])<sklearn.compose._column_transformer.make_column_selector object at 0x168a6d750>
StandardScaler()
LogisticRegression(C=1, l1_ratio=1, max_iter=2500, penalty='elasticnet',
solver='saga')print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")Average F1 Score across 225 cross validations: 0.6380025018259865
This is a score that is slightly better than random guesses.
# Refit the logistic regression with the settings reported by the grid search
# (C=1, l1_ratio=1) and tabulate its confusion matrix for hybrid vs. other.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        C=1,
        l1_ratio=1,
        max_iter=2500,
    )),
])
fitted_pipeline = my_pipeline.fit(X, y)
# NOTE: predictions are made on the same X used for fitting, so this is an
# in-sample confusion matrix, not a cross-validated one.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual Other", "Actual Hybrid"],
    columns=["Predicted Other", "Predicted Hybrid"],
)
cm_df| Predicted Other | Predicted Hybrid | |
|---|---|---|
| Actual Other | 651 | 467 |
| Actual Hybrid | 343 | 844 |
Q2
The model that did the best job distinguishing the target category from the rest was the SVC model for hybrid vs other. The model that did the worst at distinguishing the target category from the rest was the logistic regression model for hybrid vs other. This makes intuitive sense because hybrid strains are the most difficult to distinguish and that is where the models struggled the most often, even if the best model did a good job of correctly identifying the hybrid strains.
Q3
# Build one two-class subset per pair of strain types for the pairwise models.
is_data = data[data["Type"].isin(["sativa", "indica"])]
ih_data = data[data["Type"].isin(["hybrid", "indica"])]
hs_data = data[data["Type"].isin(["hybrid", "sativa"])]
# Indica vs. Sativa
# Features: every column except the identifier, the target and the raw text columns.
X = is_data.drop(columns=["Strain", "Type", "Effects", "Flavor"])
# LabelEncoder codes the classes in sorted order: indica -> 0, sativa -> 1.
y = LabelEncoder().fit_transform(is_data["Type"])

# SVC
# Tune the SVC penalty C with 5-fold cross-validation, scored by F1.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("SVC", SVC()),
])
alphas = {"SVC__C": [0.1, 0.5, 1, 10, 100]}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    scoring="f1",
    cv=5,
)
gscv_fitted = gscv.fit(X, y)
# One mean test-fold F1 per candidate value of C.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1a2ec04d0>)])),
('SVC', SVC(C=10))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1a2ec04d0>)])),
('SVC', SVC(C=10))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x1a2ec04d0>)])<sklearn.compose._column_transformer.make_column_selector object at 0x1a2ec04d0>
StandardScaler()
SVC(C=10)
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is better than random guesses.")Average F1 Score across 25 cross validations: 0.7222126650132197
This is a score that is better than random guesses.
# Refit the SVC with the C reported by the grid search (C=10) and tabulate
# its confusion matrix for indica vs. sativa.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("SVC", SVC(C=10)),
])
fitted_pipeline = my_pipeline.fit(X, y)
# NOTE: predictions are made on the same X used for fitting, so this is an
# in-sample confusion matrix, not a cross-validated one.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)
cm_df| Predicted Indica | Predicted Sativa | |
|---|---|---|
| Actual Indica | 676 | 11 |
| Actual Sativa | 26 | 405 |
# Logistic Regression
# Grid-search an elastic-net logistic regression over the L1 mixing ratio and
# C = 1..9, using 5-fold cross-validation scored by F1.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(max_iter=2500, solver="saga")),
])
alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [0.0001, 0.001, 0.01, 0.1, 1],
    "LogReg__C": [*range(1, 10)],
}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    scoring="f1",
    cv=5,
)
gscv_fitted = gscv.fit(X, y)
# One mean test-fold F1 per (l1_ratio, C) candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167f63650>)])),
('LogReg',
LogisticRegression(C=2, l1_ratio=1, max_iter=2500,
penalty='elasticnet', solver='saga'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167f63650>)])),
('LogReg',
LogisticRegression(C=2, l1_ratio=1, max_iter=2500,
penalty='elasticnet', solver='saga'))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167f63650>)])<sklearn.compose._column_transformer.make_column_selector object at 0x167f63650>
StandardScaler()
LogisticRegression(C=2, l1_ratio=1, max_iter=2500, penalty='elasticnet',
solver='saga')print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is better than random guesses.")Average F1 Score across 225 cross validations: 0.7868439457144402
This is a score that is better than random guesses.
# Refit the logistic regression with the settings reported by the grid search
# (C=2, l1_ratio=1) and tabulate its confusion matrix for indica vs. sativa.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        C=2,
        l1_ratio=1,
        max_iter=2500,
    )),
])
fitted_pipeline = my_pipeline.fit(X, y)
# NOTE: predictions are made on the same X used for fitting, so this is an
# in-sample confusion matrix, not a cross-validated one.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)
cm_df| Predicted Indica | Predicted Sativa | |
|---|---|---|
| Actual Indica | 620 | 67 |
| Actual Sativa | 75 | 356 |
# Indica vs. Hybrid
# Features: every column except the identifier, the target and the raw text columns.
X = ih_data.drop(columns=["Strain", "Type", "Effects", "Flavor"])
# LabelEncoder codes the classes in sorted order: hybrid -> 0, indica -> 1.
y = LabelEncoder().fit_transform(ih_data["Type"])

# SVC
# Tune the SVC penalty C with 5-fold cross-validation, scored by F1.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("SVC", SVC()),
])
alphas = {"SVC__C": [0.1, 0.5, 1, 10, 100]}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    scoring="f1",
    cv=5,
)
gscv_fitted = gscv.fit(X, y)
# One mean test-fold F1 per candidate value of C.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x168a6ea50>)])),
('SVC', SVC(C=1))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x168a6ea50>)])),
('SVC', SVC(C=1))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x168a6ea50>)])<sklearn.compose._column_transformer.make_column_selector object at 0x168a6ea50>
StandardScaler()
SVC(C=1)
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")Average F1 Score across 25 cross validations: 0.5707609076506064
This is a score that is slightly better than random guesses.
# Refit the SVC with the C reported by the grid search (C=1) and tabulate
# its confusion matrix for hybrid vs. indica.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("SVC", SVC(C=1)),
])
fitted_pipeline = my_pipeline.fit(X, y)
# NOTE: predictions are made on the same X used for fitting, so this is an
# in-sample confusion matrix, not a cross-validated one.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual Hybrid", "Actual Indica"],
    columns=["Predicted Hybrid", "Predicted Indica"],
)
cm_df| Predicted Hybrid | Predicted Indica | |
|---|---|---|
| Actual Hybrid | 1029 | 158 |
| Actual Indica | 198 | 489 |
# Logistic Regression
# Grid-search an elastic-net logistic regression over the L1 mixing ratio and
# C = 1..9, using 5-fold cross-validation scored by F1.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(max_iter=2500, solver="saga")),
])
alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [0.0001, 0.001, 0.01, 0.1, 1],
    "LogReg__C": [*range(1, 10)],
}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    scoring="f1",
    cv=5,
)
gscv_fitted = gscv.fit(X, y)
# One mean test-fold F1 per (l1_ratio, C) candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x166f3ccd0>)])),
('LogReg',
LogisticRegression(C=1, l1_ratio=1, max_iter=2500,
penalty='elasticnet', solver='saga'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x166f3ccd0>)])),
('LogReg',
LogisticRegression(C=1, l1_ratio=1, max_iter=2500,
penalty='elasticnet', solver='saga'))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x166f3ccd0>)])<sklearn.compose._column_transformer.make_column_selector object at 0x166f3ccd0>
StandardScaler()
LogisticRegression(C=1, l1_ratio=1, max_iter=2500, penalty='elasticnet',
solver='saga')print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")Average F1 Score across 225 cross validations: 0.6434071452695131
This is a score that is slightly better than random guesses.
# Refit the logistic regression with the settings reported by the grid search
# (C=1, l1_ratio=1) and tabulate its confusion matrix for hybrid vs. indica.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        C=1,
        l1_ratio=1,
        max_iter=2500,
    )),
])
fitted_pipeline = my_pipeline.fit(X, y)
# NOTE: predictions are made on the same X used for fitting, so this is an
# in-sample confusion matrix, not a cross-validated one.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual Hybrid", "Actual Indica"],
    columns=["Predicted Hybrid", "Predicted Indica"],
)
cm_df| Predicted Hybrid | Predicted Indica | |
|---|---|---|
| Actual Hybrid | 998 | 189 |
| Actual Indica | 238 | 449 |
# Hybrid vs. Sativa
# Features: every column except the identifier, the target and the raw text columns.
X = hs_data.drop(["Strain", "Type", "Effects", "Flavor"], axis = 1)
y = hs_data["Type"]
# LabelEncoder codes the classes in sorted order: hybrid -> 0, sativa -> 1.
y = LabelEncoder().fit_transform(y)

# SVC
# Tune the SVC penalty C with 5-fold cross-validation, scored by F1.
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("SVC", SVC())
])
alphas = {"SVC__C": [.1, .5, 1, 10, 100]}
gscv = GridSearchCV(my_pipeline, alphas, cv = 5, scoring='f1')
gscv_fitted = gscv.fit(X, y)
# One mean test-fold F1 per candidate value of C.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167dcaf50>)])),
('SVC', SVC(C=10))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167dcaf50>)])),
('SVC', SVC(C=10))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167dcaf50>)])<sklearn.compose._column_transformer.make_column_selector object at 0x167dcaf50>
StandardScaler()
SVC(C=10)
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is significantly worse than random guesses.")Average F1 Score across 25 cross validations: 0.22301352303799224
This is a score that is significantly worse than random guesses.
# Refit the SVC with the C reported by the grid search (C=10) and tabulate
# its confusion matrix for hybrid vs. sativa.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("SVC", SVC(C=10)),
])
fitted_pipeline = my_pipeline.fit(X, y)
# NOTE: predictions are made on the same X used for fitting, so this is an
# in-sample confusion matrix, not a cross-validated one.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual Hybrid", "Actual Sativa"],
    columns=["Predicted Hybrid", "Predicted Sativa"],
)
cm_df| Predicted Hybrid | Predicted Sativa | |
|---|---|---|
| Actual Hybrid | 1156 | 31 |
| Actual Sativa | 93 | 338 |
# Logistic Regression
# Grid-search an elastic-net logistic regression over the L1 mixing ratio and
# C = 1..9, using 5-fold cross-validation scored by F1.
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(max_iter=2500, solver="saga")),
])
alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [0.0001, 0.001, 0.01, 0.1, 1],
    "LogReg__C": [*range(1, 10)],
}
gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=alphas,
    scoring="f1",
    cv=5,
)
gscv_fitted = gscv.fit(X, y)
# One mean test-fold F1 per (l1_ratio, C) candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167f69c10>)])),
('LogReg',
LogisticRegression(C=9, l1_ratio=0.0001, max_iter=2500,
penalty='elasticnet', solver='saga'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Preprocessing',
ColumnTransformer(transformers=[('standardize',
StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167f69c10>)])),
('LogReg',
LogisticRegression(C=9, l1_ratio=0.0001, max_iter=2500,
penalty='elasticnet', solver='saga'))])ColumnTransformer(transformers=[('standardize', StandardScaler(),
<sklearn.compose._column_transformer.make_column_selector object at 0x167f69c10>)])<sklearn.compose._column_transformer.make_column_selector object at 0x167f69c10>
StandardScaler()
LogisticRegression(C=9, l1_ratio=0.0001, max_iter=2500, penalty='elasticnet',
solver='saga')print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly worse than random guesses.")Average F1 Score across 225 cross validations: 0.40561046292482145
This is a score that is slightly worse than random guesses.
# Refit the logistic regression with the settings reported by the grid search
# (C=9, l1_ratio=0.0001) and tabulate its confusion matrix for hybrid vs. sativa.
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(solver = "saga", max_iter = 2500, penalty = "elasticnet", C = 9, l1_ratio = .0001))
])
fitted_pipeline = my_pipeline.fit(X, y)
# NOTE: predictions are made on the same X used for fitting, so this is an
# in-sample confusion matrix, not a cross-validated one.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y_true = y, y_pred = y_pred)
# X and y were built from hs_data (hybrid vs. sativa), so the second class is
# Sativa; the previous "Indica" labels were copied over from the indica/hybrid cell.
cm_df = pd.DataFrame(cm, index=['Actual Hybrid', 'Actual Sativa'], columns=['Predicted Hybrid', 'Predicted Sativa'])
cm_df| Predicted Hybrid | Predicted Indica | |
|---|---|---|
| Actual Hybrid | 1089 | 98 |
| Actual Indica | 273 | 158 |
Q4
The model that did the best at distinguishing between the two groups was the SVC model distinguishing between Indica and Sativa. The model that was the worst at distinguishing between the two groups was the logistic regression model distinguishing between Indica and Hybrid. This does make intuitive sense as Indica and Sativa should be the easiest for the model to distinguish between, due to them having different traits on average, while Hybrid and Indica can share many traits and Hybrid is the most difficult to accurately categorize.
Q5
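If the full data with all three classes had been passed to LogisticRegression, the default behavior depends on the sklearn version: older releases used OvR, but since version 0.22 the default (multi_class="auto") fits a single multinomial (softmax) model for every solver except liblinear, so the saga solver used here would be multinomial rather than OvR. For SVC, sklearn uses OvO by default.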